# R Options
options(stringsAsFactors=FALSE, "citation_format"="pandoc")

# Required libraries
library(Seurat) # main
library(ggplot2) # plots
library(patchwork) # combination of plots
library(magrittr) # %>% operator
library(reticulate) # required for 'leiden' clustering
library(enrichR) # functional enrichment
library(future) # multicore support for Seurat
plan("multiprocess") # by default, this uses all available cores; use "workers=4" for example, to use 4 cores
options(future.globals.maxSize = 800000000) # increase if the maximum allowed size in bytes of objects exported by future is exceeded

# Other libraries we use
# Knit: knitr
# Data handling: dplyr, tidyr, purrr, stringr, Matrix
# Tables: kableExtra
# Plots: ggsci, ggpubr
# IO: openxlsx, readr, R.utils
# Annotation: biomaRt
# DEG: mast
# Functional enrichment: enrichR
# Other: sessioninfo

# Knitr default options
knitr::opts_chunk$set(echo=TRUE, cache=FALSE, message=FALSE, warning=FALSE, tidy=FALSE, fig.width=10)

# Python needed for clustering, umap, other python packages
# Change location if necessary
Sys.setenv(RETICULATE_PYTHON = "/usr/bin/python")


# biomaRt mirror: NULL means the first working mirror is used
mirror = NULL

Dataset description

Project-specific parameters

This code chunk contains all parameters that are set specifically for the project.

param = list()

# Project ID
param$project = "pbmc"

# Input data path in case Cell Ranger was run 
param$path_data = data.frame(name=c("pbmc_10x","pbmc_smartseq2"),
                             type=c("10x","smartseq2"),
                             path=c("test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/10x/","test_datasets/10x_SmartSeq2_pbmc_GSE132044/counts/smartseq2/counts_table.tsv.gz"))
param$file_mapping_stats = NULL

# Project-specific paths
param$path_out = "test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/"
if (!file.exists(param$path_out)) dir.create(param$path_out, recursive=TRUE, showWarnings=FALSE)

# Marker genes based on literature, translated to Ensembl IDs
# xlsx file, one list per column, first row as header and Ensembl IDs below
# Set to NULL if no known marker genes should be plotted
param$file_known_markers = "test_datasets/10x_pbmc_1k_healthyDonor_v3Chemistry/known_markers.xlsx"

# Annotation via biomaRt
param$mart_dataset = "hsapiens_gene_ensembl"
param$annot_version = 98
param$annot_main = c(ensembl="ensembl_gene_id", symbol="external_gene_name", entrez="external_gene_name")
param$file_annot = NULL
if (is.null(param$file_annot)) {
  param$file_annot = file.path(param$path_out, paste0(param$mart_dataset, ".v", param$annot_version, ".annot.txt"))
}
param$mart_attributes = c(param$annot_main, 
                          c("chromosome_name", "start_position", "end_position", 
                            "percentage_gene_gc_content", "gene_biotype", "strand", "description"))

# Prefix of mitochondrial genes 
param$mt = "^MT-"

# Filters
param$cell_filter = list(nFeature_RNA=c(200,NA), percent_mt=c(NA,20))
param$feature_filter = list(min_counts=1, min_cells=3) # feature has to be found by at least one count in one cell
param$samples_to_drop = c("NC", "RNA") # cells from these samples will be dropped after initial QC

# Whether or not to remove cell cycle effects
param$cc_remove = FALSE

# Should all cell cycle effects be removed, or only the difference between profilerating cells (G2M and S phase)?
# Read https://satijalab.org/seurat/v3.1/cell_cycle_vignette.html, for an explanation
param$cc_remove_all = FALSE

# Additional (unwanted) variables that will be regressed out for visualisation and clustering
param$vars_to_regress = c()

# Additional (unwanted) variables to be included in the statistical tests
param$latent_vars = NULL

# When there are multiple datasets, how to integrate them:
#   - method:
#     - merge: Just merge them since no integration is needed (e.g. samples were multiplexed on the same chip)
#     - standard: Anchors are computed for all pairs of datasets. This will give all datasets the same weight during dataset integration but can be computationally intensive.
#     - reference: One dataset is used as reference and anchors are computed for all other datasets. Less accurate but computationally faster. 
#     - reciprocal: Anchors are computed in PCA space instead of the data. Even less accurate but for very big datasets.
#
#   - reference_dataset: When using method "reference", which dataset is the reference? Can be numeric or name of the dataset.
#   - dimensions: Number of dimensions to consider for integration
param$integrate_samples = list(method="standard", reference_dataset=1, dimensions=30)

# Which normalisation should be used for analysis: RNA or SCT?
param$normalisation_default = "SCT"

# The number of PCs to use; adjust this parameter based on the Elbowplot 
param$pc_n = 10

# Resolution of clusters; low values will lead to fewer clusters of cells 
param$cluster_resolution=0.5

# Thresholds to define differentially expressed genes 
param$padj = 0.05
param$log2fc = log2(1.5)

# Enrichr databases of interest
param$enrichr_dbs = c("GO_Molecular_Function_2018", "GO_Biological_Process_2018", "GO_Cellular_Component_2018")

# Main colour(s) to use for plots
param$col = "palevioletred"

# Colour palette and colours used for samples
param$col_palette_samples = ggsci::pal_jama

# Colour palette and colours used for cluster
param$col_palette_cluster = ggsci::pal_startrek

# Sample data to at most n cells per dataset/sample (mainly for tests); set to NULL to deactivate
param$sample_cells = NULL

# Path to git repository
param$path_to_git = "."

Read data

Read gene annotation

If not provided already, we read gene annotation from Ensembl and write the resulting table to file. We generate several dictionaries to translate between Ensembl IDs, gene symbols, Entrez Ids, and Seurat rownames.

# Read annotation from csv or from Ensembl and a tab separated txt will be created
if (file.exists(param$file_annot)) {
  annot_ensembl = read.delim(param$file_annot)
} else {
  annot_mart = biomaRt::useEnsembl("ensembl", dataset=param$mart_dataset, mirror=mirror, version=param$annot_version)
  annot_ensembl = biomaRt::getBM(mart=annot_mart, attributes=param$mart_attributes)
  write.table(annot_ensembl, file=param$file_annot, sep='\t', col.names=TRUE, row.names=FALSE, append=FALSE)
  message("Gene annotation file was created at: ", param$file_annot)
  # Note: depending on the attributes, there might be more than one row per gene
}

# Double-check if we got all required annotation, in case annotation file was read
check_annot_main = all(param$annot_main %in% colnames(annot_ensembl))
if (!check_annot_main) {
  stop("The annotation table misses at least one of the following columns: ", paste(param$annot_main, collapse=", "))
}

# Create translation tables
ensembl = param$annot_main["ensembl"]
symbol = param$annot_main["symbol"]
entrez = param$annot_main["entrez"]

# Ensembl id to gene symbol
ensembl_to_symbol = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_symbol = setNames(ensembl_to_symbol[, symbol], ensembl_to_symbol[, ensembl])

# Ensembl id to seurat-compatible unique rowname
ensembl_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
ensembl_to_seurat_rowname[, symbol] = make.unique(gsub(pattern="_", replacement="-", x=ensembl_to_seurat_rowname[, symbol], fixed=TRUE))
ensembl_to_seurat_rowname = setNames(ensembl_to_seurat_rowname[, symbol], ensembl_to_seurat_rowname[, ensembl])

# Seurat-compatible unique rowname to ensembl id
seurat_rowname_to_ensembl = setNames(names(ensembl_to_seurat_rowname), ensembl_to_seurat_rowname)

# Gene symbol to ensembl id: named LIST to account for genes where one symbol translates to multiple Ensembl IDs
symbol_to_ensembl_df = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_ensembl = split(symbol_to_ensembl_df[, ensembl], symbol_to_ensembl_df[, symbol])

# Gene symbol to (seurat compatible unique) gene symbol: named LIST to account for genes with multiple names
symbol_to_seurat_rowname = unique(annot_ensembl[, c(ensembl, symbol)])
symbol_to_seurat_rowname$seurat_rowname = ensembl_to_seurat_rowname[symbol_to_seurat_rowname[, ensembl]]
symbol_to_seurat_rowname = split(symbol_to_seurat_rowname$seurat_rowname, symbol_to_seurat_rowname[, symbol])

# Ensembl to Entrez
ensembl_to_entrez = unique(annot_ensembl[, c(ensembl, entrez)])
ensembl_to_entrez[, entrez] = ifelse(nchar(ensembl_to_entrez[, entrez]) == 0, NA, ensembl_to_entrez[, entrez])
ensembl_to_entrez = split(ensembl_to_entrez[, entrez], ensembl_to_entrez[, ensembl])

# Seurat-compatible unique rowname to Entrez
seurat_rowname_to_ensembl_match = match(seurat_rowname_to_ensembl, names(ensembl_to_entrez))
names(seurat_rowname_to_ensembl_match) = names(seurat_rowname_to_ensembl)
seurat_rowname_to_entrez = purrr::map(seurat_rowname_to_ensembl_match, function(i) {unname(ensembl_to_entrez[[i]])})

# Entrez IDs is duplicating Ensembl IDs in annot_ensembl
# Therefore, we remove Entrez IDs from the annotation table, after generating all required translation tables
# Set rownames of annotation table to Ensembl identifiers
annot_ensembl = annot_ensembl[, -match(entrez, colnames(annot_ensembl))] %>% unique() %>% as.data.frame()
rownames(annot_ensembl) = annot_ensembl[, ensembl]

Read scRNA-seq data

We next read the scRNA-seq dataset(s) into Seurat.

# List of Seurat objects
sc = list()

datasets = param$path_data
for (i in seq(nrow(datasets))) {
  name = datasets[i,"name"]
  type = datasets[i,"type"]
  path = datasets[i,"path"]
  
  # Read 10X or smartseq2
  if (type == "10x") {
    
    # Read 10X sparse matrix into a Seurat object
    sc[[name]] = ReadSparseMatrix(path, 
                                  project=name, 
                                  row_name_column=1, 
                                  convert_row_names=ensembl_to_seurat_rowname)
    
  } else if (type == "smartseq2") {
    
    # Read counts table into a Seurat object
    sc = c(sc, ReadCountsTable(path, project=name, row_name_column=1, convert_row_names=ensembl_to_seurat_rowname, parse_plate_information=TRUE, return_samples_as_datasets=TRUE))
  } 
}

# Check that sample names are unique between datasets: this can happen if multiple Smartseq2 datasets have a negative control (NC)
# In this case, just use the dataset name which is unique, e.g. set1.NC and set2.NC
sample_names = purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) })
duplicate_sample_names = purrr::reduce(sample_names, intersect)
sc = purrr::map(list_names(sc), function(n) {
  if (any(sc[[n]][["orig.ident", drop=TRUE]] %in% duplicate_sample_names)) {
    sc[[n]][["orig.ident.old"]] = sc[[n]][["orig.ident"]]
    sc[[n]][["orig.ident"]] = n
    sc[[n]] = Seurat::SetIdent(sc[[n]], value="orig.ident")
  }
  return(sc[[n]])
})

# Enforce unique barcodes per cell after integration
if (length(sc) > 1) {
  for (i in 1:length(sc)) {
    # Remove "-1" if necessary
    sc[[i]] = Seurat::RenameCells(sc[[i]], new.names=gsub("-1", "", Cells(sc[[i]])))
    # Add "-1", "-2", and so on
    sc[[i]] = Seurat::RenameCells(sc[[i]], new.names=paste0(Cells(sc[[i]]), "-", i)) 
  }
}

# Set up colors for samples
sample_names = purrr::flatten_chr(purrr::map(sc, function(s){ unique(as.character(s[[]][["orig.ident"]])) }))
param$col_samples = GenerateColours(num_colours=length(sample_names), palette=param$col_palette_samples)
names(param$col_samples) = sample_names

# Sample cells if requested
if (!is.null(param$sample_cells)) {
  sc = purrr::map(sc, function(s) {
    cells = colnames(s)
    return(subset(s, cells=sample(cells, min(param$sample_cells, length(cells)))))
  })
}

Pre-processing

Quality control

We start the analysis by removing unwanted cells from the dataset(s). Three commonly used QC metrics include the number of unique genes detected in each cell (“nFeature_RNA”), the total number of molecules detected in each cell (“nCount_RNA”), and the percentage of counts that map to the mitochrondrial genome (“percent_mt”). If ERCC spike-in controls were used, the percentage of counts mapping to them is also shown (“percent_ercc”).

The following first table shows metadata (columns) of the first 5 cells (rows). These metadata provide additional information about the cells in the dataset, such as the sample a cell belongs to (“orig.ident”), or the above mentioned number of unique genes detected (“nFeature”). The second table shows metadata (columns) of the first 5 genes (rows), for example the number of cells with at least 1 count for the gene (“num_cells_expr”), and the number of cells with at least as many counts as set in the parameter filter section (“num_cells_expr_threshold”).

Cell metadata, top 5 rows
orig.ident nCount_RNA nFeature_RNA percent_mt
PBMC1_10x_AAACCCACACTTGGGC-1 pbmc_10x 7552 2037 8.659958
PBMC1_10x_AAACCCACAGGTGGAT-1 pbmc_10x 4773 1800 8.694741
PBMC1_10x_AAACCCAGTGCTTATG-1 pbmc_10x 4430 1565 14.762980
PBMC1_10x_AAACCCATCCGTAGTA-1 pbmc_10x 4512 1592 8.621454
PBMC1_10x_AAACCCATCTTACACT-1 pbmc_10x 6663 1919 8.314573
Feature metadata, top 5 rows (only first dataset shown)
feature_id feature_name feature_type num_cells_expr num_cells_expr_threshold
TSPAN6 ENSG00000000003 TSPAN6 Gene Expression 151 151
TNMD ENSG00000000005 TNMD Gene Expression 0 0
DPM1 ENSG00000000419 DPM1 Gene Expression 629 629
SCYL3 ENSG00000000457 SCYL3 Gene Expression 166 166
C1orf112 ENSG00000000460 C1orf112 Gene Expression 45 45

Filtering

The size of the Seurat object before filtering is:

## $pbmc_10x
## An object of class Seurat 
## 33694 features across 4033 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)
## 
## $pbmc_smartseq2.pbmc_smartseq2.sample1
## An object of class Seurat 
## 33694 features across 311 samples within 1 assay 
## Active assay: RNA (33694 features, 0 variable features)

Cells and genes are filtered based on the following thresholds:

Filters applied to cells
Min Max
pbmc_10x.nFeature_RNA 200 NA
pbmc_10x.percent_mt NA 20
pbmc_smartseq2.pbmc_smartseq2.sample1.nFeature_RNA 200 NA
pbmc_smartseq2.pbmc_smartseq2.sample1.percent_mt NA 20
Filters applied to genes
n
pbmc_10x.min_counts 1
pbmc_10x.min_cells 3
pbmc_smartseq2.pbmc_smartseq2.sample1.min_counts 1
pbmc_smartseq2.pbmc_smartseq2.sample1.min_cells 3

The number of excluded cells and features is as follows:

Number of excluded cells
nFeature_RNA percent_mt sample_to_drop
pbmc_10x 286 271 0
pbmc_smartseq2.pbmc_smartseq2.sample1 1 0 0
Number of excluded genes
pbmc_10x pbmc_smartseq2.pbmc_smartseq2.sample1
Genes 13893 15754

After filtering, the size of the Seurat object is:

## $pbmc_10x
## An object of class Seurat 
## 19801 features across 3608 samples within 1 assay 
## Active assay: RNA (19801 features, 0 variable features)
## 
## $pbmc_smartseq2.pbmc_smartseq2.sample1
## An object of class Seurat 
## 17940 features across 310 samples within 1 assay 
## Active assay: RNA (17940 features, 0 variable features)

Normalisation, cell cycle scoring, scaling and variable genes

In this section, we subsequently run a series of Seurat functions:
1. We start by running a standard log normalisation, where counts for each cell are divided by the total counts for that cell and multiplied by 10,000. This is then natural-log transformed.
2. Variable genes are selected. For downstream analysis it is beneficial to focus on genes that exhibit high cell-to-cell variation, that is they are highly expressed in some cells and lowly expressed in others.
3. To be able to compare normalised gene counts between genes, gene counts are further scaled to have zero mean and unit variance (z-score). This way, genes are equally weighted for downstream analysis.

In addition, we run:
4. SCTransform, a new and more sophisticated normalisation method that replaces the previous functions (normalisation, variable genes and scaling). Note that although results are kept for steps 1-3 (“RNA” assay), downstream analyses will default to resulting data of step 4 (“SCT” assay).

Finally, cell cycle scores are assigned to each cell based on its normalised expression of G2/M and S phase markers, after basic normalisation (step 1). These scores are visualised in a separate section further below. If specified in the above parameter section, cell cycle effects are removed during scaling (step 4). Note that removing all signal associated to cell cycle can negatively impact downstream analysis. For example, in differentiating processes, stem cells are quiescent and differentiated cells are proliferating (or vice versa), and removing all cell cycle effects can blur the distinction between these cells. As an alternative, we can remove the difference between G2M and S phase scores. This way, signals separating non-cycling and cycling cells will be maintained, while differences amongst proliferating cells will be removed. For a more detailed explanation, see the cell cycle vignette for Seurat (Unknown 2020). Cell cycle effect removed for this report: FALSE; all cell cycle effects removed for this report: FALSE.

While raw data is typically used for statistical tests such as finding marker genes, normalised data is mainly used for visualising gene expression values. Scaled data include variable genes only, potentially without cell cycle effects, and are mainly used to determine the structure of the dataset(s) with Principal Component Analysis, and indirectly to cluster and visualise cells in 2D space.

Variable genes

Experience shows that 1,000-2,000 genes with the highest cell-to-cell variation are often sufficient to describe the global structure of a single cell dataset. For example, cell type-specific genes typically highly vary between cells. Housekeeping genes, on the other hand, are similarly expressed across cells and can be disregarded to differentiate between cells.

To determine variable genes, we need to separate biological variability from technical variability. Technical variability arises especially for lowly expressed genes, where high variability corresponds to small absolute changes that we are not interested in. Here, we use the variance-stabilizing transformation (vst) method implemented in Seurat (Hafemeister and Satija (2019). This method first models the technical variability as a relationship between mean gene expression and variance using local polynomial regression. The model is then used to calculate the expected variance based on the observed mean gene expression. The difference between the observed and expected variance is called residual variance and likely reflects biological variability. The top 3,000 variable genes are used for further analysis.

Relative log expression

To better understand the efficiency of the applied normalisation procedures, we plot the relative log expression of genes in at most 100 randomly selected cells per sample before and after normalisation. This type of plot reveals unwanted variation in your data. The concept is taken from Gandolfo and Speed (2018). In brief, we remove variation between genes, leaving only variation between samples. If expression levels of most genes are similar in all cell types, sample heterogeneity is a sign of unwanted variation.

For each gene, we calculate its median expression across all cells, and then calculate the deviation from this median for each cell. For each cell, we plot the median expression (black), the interquartile range (lightgrey), whiskers defined as 1.5 times the interquartile range (darkgrey), and outliers (#1F77B4FF, #FF7F0EFF).

Advanced normalisation (SCT)

if (param$integrate_samples[["method"]]!="single") {
  
  # Markdown text for this section (do not change intendation)
cat("## Integration of multiple datasets\n\n")
  
  # Feature metadata is removed by Seurat merge entirely; save separately for each assay and add again afterwards
  assay_names = unique(purrr::flatten_chr(purrr::map(list_names(sc), function(n) { Seurat::Assays(sc[[n]]) } )))
  
  # Loop through all assays and accumulate meta data
  feature_data_for_assay = purrr::map(values_to_names(assay_names), function(a) {
    # "feature_id", "feature_name", "feature_type" are accumulated for all assays and stored just once
    # This step is skipped for assays that do not contain all three rypes of feature information
    contains_neccessary_columns = purrr::map_lgl(list_names(sc), function(n) { all(c("feature_id","feature_name","feature_type") %in% colnames(sc[[n]][[a]][[]])) })
    
    if (all(contains_neccessary_columns)) {
      feature_id_name_type = purrr::reduce(list_names(sc), function(x, y) {
        df_x = sc[[x]][[a]][[c("feature_id", "feature_name", "feature_type")]]
        df_y = sc[[y]][[a]][[c("feature_id", "feature_name", "feature_type")]]
        new_rows = which(!rownames(df_y) %in% rownames(df_x))
        rbind(df_x, df_y[new_rows,])
      })
      feature_id_name_type$row_names = rownames(feature_id_name_type)
    } else {
      feature_id_name_type = NULL
    }
    
    # For all other metadata, we prefix column names with the dataset
    other_feature_data = purrr::map(list_names(sc), function(n) {
      df = sc[[n]][[a]][[]] %>% dplyr::select(-dplyr::one_of("feature_id","feature_name","feature_type"))
      colnames(df) = paste(n, colnames(df), sep=".")
      df$row_names = rownames(df)
      return(df)
    })
    
    # Now join everything by row_names by full outer join
    if(!is.null(feature_id_name_type)) {
      feature_data = purrr::reduce(c(list(feature_id_name_type=feature_id_name_type),other_feature_data), dplyr::full_join, by="row_names")
    } else {
      feature_data = purrr::reduce(other_feature_data, dplyr::full_join, by="row_names")
    }
    rownames(feature_data) = feature_data$row_names
    feature_data$row_names = NULL
    
    return(feature_data)
  })
}

Integration of multiple datasets

# Standard method for integrating multiple samples. Best performance but computationally intensive.
if (param$integrate_samples[["method"]]=="standard") {
  # Note "Assay names should only have numbers and letters: Warnung: Keys should be one or more alphanumeric characters followed by an underscore, setting key from rna_integrated_ to rnaintegrated_" (seurat/R/object.R)
  
  # The integration step will temporarily occupy lots of memory. However, R has problems with freeing unused memory.
  # By wrapping the steps into a function, hopefully this works a bit better.
  run_standard_integration = function(sc_objs, ndims=30, vars_to_regress=c()) {
    # Find integration anchors for assay RNA
    integrate_RNA_features = Seurat::SelectIntegrationFeatures(object.list=sc_objs, nfeatures=3000, assay=rep("SCT", length(sc)), verbose=FALSE)
    integrate_RNA_anchors = Seurat::FindIntegrationAnchors(object.list=sc_objs, normalization.method="LogNormalize", anchor.features=integrate_RNA_features, dims=1:ndims, assay=rep("RNA", length(sc_objs)), verbose=FALSE)
    tmp_sc_objs = Seurat::IntegrateData(integrate_RNA_anchors, new.assay.name="RNAintegrated", normalization.method="LogNormalize", dims=1:ndims, verbose=FALSE)
    
    sc_obj_RNAintegrated_assay = Seurat::GetAssay(tmp_sc_objs, assay="RNAintegrated")
    rm(tmp_sc_objs, integrate_RNA_anchors, integrate_RNA_features)
    
    # Find integration anchors for assay SCT
    integrate_SCT_features = SelectIntegrationFeatures(object.list=sc_objs, nfeatures=3000, assay=rep("SCT", length(sc)), verbose=FALSE)
    sc_objs = PrepSCTIntegration(object.list=sc_objs, anchor.features=integrate_SCT_features, verbose=TRUE)
    integrate_SCT_anchors = FindIntegrationAnchors(object.list=sc_objs, normalization.method="SCT", anchor.features=integrate_SCT_features, assay=rep("SCT", length(sc_objs)), verbose=FALSE)
    sc_objs = Seurat::IntegrateData(integrate_SCT_anchors, new.assay.name="SCTintegrated", normalization.method="SCT", dims=1:ndims, verbose=FALSE)
    sc_objs[["RNAintegrated"]] = sc_obj_RNAintegrated_assay
    rm(integrate_SCT_anchors, integrate_SCT_features, sc_obj_RNAintegrated_assay)
    
    # scale data: according to seurat, this is needed only for the integrated RNA assay
    sc_objs = Seurat::ScaleData(sc_objs, features=rownames(sc_objs[["RNAintegrated"]]), verbose=FALSE, vars.to.regress=vars_to_regress, assay="RNAintegrated")
    
    # Call garbage collector to free memory (hope it helps)
    gc()
    return(sc_objs)
  }
  
  # call function
  sc = run_standard_integration(sc, ndims=param$integrate_samples[["dimensions"]], vars_to_regress=param$vars_to_regress)
}

Dimensionality reduction

A single-cell dataset of 20,000 genes and 5,000 cells has 20,000 dimensions. At this point of the analysis, we have already reduced the dimensionality of the dataset to 3,000 variable genes. The biological manifold however can be described by far fewer dimensions than the number of (variable) genes. Dimension reduction methods aim to find these dimensions. There are two general purposes for dimension reduction methods: to summarise a dataset, and to visualise a dataset.

We use Principal Component Analysis (PCA) to summarise a dataset, overcoming noise and reducing the data to its essential components. Each principal component (PC) represents a “metafeature” that combines information across a correlated gene set. Later, we use Uniform Manifold Approximation and Projection (UMAP) to visualise the dataset, placing similar cells together in 2D space, see below.

To decide how many PCs to include in downstream analyses, we visualize cells and genes that define the PCA.

Dimensionality of the dataset

We next need to decide how many PCs we want to use for our analyses. The following “Elbow plot” is designed to help us make an informed decision. PCs are ranked based on the percentage of variance they explain.

For your dataset, we decided to go for 10 PCs.

Downstream analysis

Visualisation with UMAP

We use a UMAP to visualise and explore a dataset. The goal is to place similar cells together in 2D space, and learn about the biology underlying the data. Cells are color-coded according to the graph-based clustering, and clusters typcially co-localise on the UMAP.

Take care not to mis-read a UMAP:

  • Parameters influence the plot (we use defaults here)
  • Cluster sizes relative to each other mean nothing, since the method has a local notion of distance
  • Distances between clusters might not mean anything
  • You may need more than one plot

For a nice read to intuitively understand UMAP, see (“Understanding Umap” 2019).

The following table shows the number of cells per sample per cluster:

  • n: Number of cells per sample per cluster
  • perc: Percentage of cells per sample per cluster compared to all other cells of that sample

In case the dataset contains 2 or more samples, we also calculate whether or not the number of cells of a sample in a cluster is significantly higher than expected:

  • oddsRatio: Odds ratio calculated for cluster c1 and sample s1 as (# cells s1 in c1 / # cells not s1 in c1) / (# cells s1 not in c1 / # cells not s1 not in c1)
  • p: P-value calculated with a fisher test to test whether “n” is higher than expected
Number of cells per cluster per sample
pbmc_10x.n pbmc_10x.perc pbmc_10x.oddsRatio pbmc_10x.p pbmc_smartseq2.sample1.n pbmc_smartseq2.sample1.perc pbmc_smartseq2.sample1.oddsRatio pbmc_smartseq2.sample1.p
Cluster_1 586 16.24 1.11 0.29 46 14.84 0.90 0.76
Cluster_2 509 14.11 1.65 0.01 28 9.03 0.60 1.00
Cluster_3 458 12.69 1.11 0.33 36 11.61 0.90 0.74
Cluster_4 410 11.36 1.53 0.03 24 7.74 0.65 0.98
Cluster_5 367 10.17 0.68 0.99 44 14.19 1.46 0.02
Cluster_6 334 9.26 0.80 0.90 35 11.29 1.25 0.14
Cluster_7 298 8.26 0.71 0.97 35 11.29 1.41 0.05
Cluster_8 271 7.51 1.32 0.16 18 5.81 0.76 0.89
Cluster_9 161 4.46 0.92 0.68 15 4.84 1.09 0.42
Cluster_10 123 3.41 1.33 0.28 8 2.58 0.75 0.83
Cluster_11 52 1.44 0.27 1.00 16 5.16 3.72 0.00
Cluster_12 39 1.08 0.67 0.87 5 1.61 1.50 0.27

Cell Cycle Effect

How much do gene expression profiles in the dataset reflect the cell cycle phases the single cells were in? After initial normalisation, we determined the effects of cell cycle heterogeneity by calculating a score for each cell based on its expression of G2M and S phase markers. Scoring is based on the strategy described in Tirosh et al. (2016), and human gene symbols are translated to gene symbols of the species of interest using biomaRt. This section of the report visualises the above calculated cell cycle scores.

Known marker genes

Do cells in individual clusters express provided known marker genes?

You provided 7 list(s) of known marker genes. In the following tabs, you find:

  • Dot plots for all gene lists containing at most 50 genes
  • Average feature plots for all gene lists containing at least 10 genes
  • Individual feature plots for all genes if there are no more than 30 genes in total

Differentially expressed genes

We next identify genes that are differentially expressed in one cluster compared to all other clusters, based on raw “RNA” data and the method “MAST”. Resulting p-values are adjusted using the Bonferroni method. The names of differentially expressed genes per cluster, alongside statistical measures and additional gene annotation are written to file.

Top 2 DEGs per cell cluster
p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
0 0.9913495 0.783 0.419 0 1 DUSP2
0 1.2146775 0.394 0.089 0 1 GZMK
0 1.3311606 0.963 0.346 0 2 LTB
0 1.2017087 0.961 0.304 0 2 IL7R
0 1.6791687 0.992 0.282 0 3 GNLY
0 0.8093787 0.996 0.531 0 3 CCL5.1
0 2.7707200 1.000 0.018 0 4 CD79A
0 3.5828496 0.608 0.015 0 4 IGKC
0 4.1959026 0.998 0.062 0 5 S100A9
0 4.1330835 1.000 0.102 0 5 LYZ
0 2.0929338 0.989 0.308 0 6 GNLY
0 1.6593385 0.976 0.357 0 6 PRF1
0 1.3116630 0.997 0.351 0 7 GZMH
0 1.4312706 0.976 0.316 0 7 FGFBP2
0 1.1642258 0.720 0.077 0 8 LEF1
0 1.1937578 0.747 0.093 0 8 CCR7
0 1.9924267 0.949 0.192 0 9 KLRB1
0 1.4526692 0.602 0.104 0 9 AQP3
0 2.7295526 1.000 0.198 0 10 LST1
0 2.4820210 1.000 0.166 0 10 AIF1
0 5.1206311 0.926 0.017 0 11 PPBP
0 4.8122067 0.882 0.076 0 11 NRGN
0 2.5224098 1.000 0.550 0 12 HLA-DPA1.2
0 2.5958110 1.000 0.186 0 12 CST3
# Add Ensembl annotation
sc_markers_ensembl = seurat_rowname_to_ensembl[sc_markers[,"gene"]]
sc_markers_annot = cbind(sc_markers, annot_ensembl[sc_markers_ensembl,])

# Output in Excel sheet
sc_markers_lst = lapply(levels(sc_markers_annot$cluster), function(x) {sc_markers_annot %>% dplyr::filter(cluster==x)})
names(sc_markers_lst) = paste0("cluster", levels(sc_markers$cluster))
openxlsx::write.xlsx(sc_markers_lst, file=paste0(param$path_out, "/markers.xlsx"))

# Filter markers based on p-value and fold-change 
sc_markers_filt = sc_markers %>% 
  dplyr::filter(p_val_adj <= param$padj) %>% 
  dplyr::filter((avg_logFC <= -param$log2fc) | (avg_logFC >= param$log2fc)) %>% 
  as.data.frame()
sc_markers_filt_down = sc_markers_filt %>% 
  dplyr::filter(avg_logFC <= -param$log2fc) %>% 
  as.data.frame()
sc_markers_filt_up = sc_markers_filt %>% 
  dplyr::filter(avg_logFC >= param$log2fc) %>% 
  as.data.frame()

# Number of DEGs per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
sc_markers_filt_n = cbind(Cluster=cluster_all, 
                          Down=sapply(cluster_all, function(x) sum(sc_markers_filt_down$cluster == x)), 
                          Up=sapply(cluster_all, function(x) sum(sc_markers_filt_up$cluster == x))) %>% 
  as.data.frame() %>% 
  tidyr::pivot_longer(cols=c("Down", "Up"), 
                      names_to="Direction", 
                      values_to="n")
sc_markers_filt_n$Cluster = as.factor(sc_markers_filt_n$Cluster)

p = ggplot(sc_markers_filt_n, aes(x=Cluster, y=n, fill=Direction)) + geom_bar(stat="identity") 
p = PlotMystyle(p, 
                title=paste0("Number of DEGs per cell cluster\n(FC=", 2^param$log2fc, ", adj. p-value=", param$padj, ")"), 
                fill=c("steelblue", "darkgoldenrod1"))
p

Visualisation of differentially expressed genes

The following plots are exemplary to how we can visualize differentially expressed genes using the Seurat R-package. The selected genes are the top differentially expressed genes for each cluster, respectively.

Functional enrichment analysis

To gain first insights into potential functions of cells in a cluster, we test for over-representation of functional terms amongst up- and down-regulated genes of each cluster. Over-represented terms are written to file.

We first translate gene symbols of up- and down-regulated genes per cluster into Entrez gene symbols, and then use the “enrichR” R-package to access the “Enrichr” website (Chen 2020). You can choose to test functional enrichment from a wide range of databases:

Enrichr databases
geneCoverage genesPerTerm libraryName link numTerms
13362 275 Genome_Browser_PWMs http://hgdownload.cse.ucsc.edu/goldenPath/hg18/database/ 615
27884 1284 TRANSFAC_and_JASPAR_PWMs http://jaspar.genereg.net/html/DOWNLOAD/ 326
6002 77 Transcription_Factor_PPIs 290
47172 1370 ChEA_2013 http://amp.pharm.mssm.edu/lib/cheadownload.jsp 353
47107 509 Drug_Perturbations_from_GEO_2014 http://www.ncbi.nlm.nih.gov/geo/ 701
21493 3713 ENCODE_TF_ChIP-seq_2014 http://genome.ucsc.edu/ENCODE/downloads.html 498
1295 18 BioCarta_2013 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 249
3185 73 Reactome_2013 http://www.reactome.org/download/index.html 78
2854 34 WikiPathways_2013 http://www.wikipathways.org/index.php/Download_Pathways 199
15057 300 Disease_Signatures_from_GEO_up_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
4128 48 KEGG_2013 http://www.kegg.jp/kegg/download/ 200
34061 641 TF-LOF_Expression_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 269
7504 155 TargetScan_microRNA http://www.targetscan.org/cgi-bin/targetscan/data_download.cgi?db=vert_61 222
16399 247 PPI_Hub_Proteins http://amp.pharm.mssm.edu/X2K 385
12753 57 GO_Molecular_Function_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 1136
23726 127 GeneSigDB http://genesigdb.org/genesigdb/downloadall.jsp 2139
32740 85 Chromosome_Location http://software.broadinstitute.org/gsea/msigdb/index.jsp 386
13373 258 Human_Gene_Atlas http://biogps.org/downloads/ 84
19270 388 Mouse_Gene_Atlas http://biogps.org/downloads/ 96
13236 82 GO_Cellular_Component_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 641
14264 58 GO_Biological_Process_2015 http://www.geneontology.org/GO.downloads.annotations.shtml 5192
3096 31 Human_Phenotype_Ontology http://www.human-phenotype-ontology.org/ 1779
22288 4368 Epigenomics_Roadmap_HM_ChIP-seq http://www.roadmapepigenomics.org/ 383
4533 37 KEA_2013 http://amp.pharm.mssm.edu/lib/keacommandline.jsp 474
10231 158 NURSA_Human_Endogenous_Complexome https://www.nursa.org/nursa/index.jsf 1796
2741 5 CORUM http://mips.helmholtz-muenchen.de/genre/proj/corum/ 1658
5655 342 SILAC_Phosphoproteomics http://amp.pharm.mssm.edu/lib/keacommandline.jsp 84
10406 715 MGI_Mammalian_Phenotype_Level_3 http://www.informatics.jax.org/ 71
10493 200 MGI_Mammalian_Phenotype_Level_4 http://www.informatics.jax.org/ 476
11251 100 Old_CMAP_up http://www.broadinstitute.org/cmap/ 6100
8695 100 Old_CMAP_down http://www.broadinstitute.org/cmap/ 6100
1759 25 OMIM_Disease http://www.omim.org/downloads 90
2178 89 OMIM_Expanded http://www.omim.org/downloads 187
851 15 VirusMINT http://mint.bio.uniroma2.it/download.html 85
10061 106 MSigDB_Computational http://www.broadinstitute.org/gsea/msigdb/collections.jsp 858
11250 166 MSigDB_Oncogenic_Signatures http://www.broadinstitute.org/gsea/msigdb/collections.jsp 189
15406 300 Disease_Signatures_from_GEO_down_2014 http://www.ncbi.nlm.nih.gov/geo/ 142
17711 300 Virus_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 323
17576 300 Virus_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 323
15797 176 Cancer_Cell_Line_Encyclopedia https://portals.broadinstitute.org/ccle/home 967
12232 343 NCI-60_Cancer_Cell_Lines http://biogps.org/downloads/ 93
13572 301 Tissue_Protein_Expression_from_ProteomicsDB https://www.proteomicsdb.org/ 207
6454 301 Tissue_Protein_Expression_from_Human_Proteome_Map http://www.humanproteomemap.org/index.php 30
3723 47 HMDB_Metabolites http://www.hmdb.ca/downloads 3906
7588 35 Pfam_InterPro_Domains ftp://ftp.ebi.ac.uk/pub/databases/interpro/ 311
7682 78 GO_Biological_Process_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 941
7324 172 GO_Cellular_Component_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 205
8469 122 GO_Molecular_Function_2013 http://www.geneontology.org/GO.downloads.annotations.shtml 402
13121 305 Allen_Brain_Atlas_up http://www.brain-map.org/ 2192
26382 1811 ENCODE_TF_ChIP-seq_2015 http://genome.ucsc.edu/ENCODE/downloads.html 816
29065 2123 ENCODE_Histone_Modifications_2015 http://genome.ucsc.edu/ENCODE/downloads.html 412
280 9 Phosphatase_Substrates_from_DEPOD http://www.koehn.embl.de/depod/ 59
13877 304 Allen_Brain_Atlas_down http://www.brain-map.org/ 2192
15852 912 ENCODE_Histone_Modifications_2013 http://genome.ucsc.edu/ENCODE/downloads.html 109
4320 129 Achilles_fitness_increase http://www.broadinstitute.org/achilles 216
4271 128 Achilles_fitness_decrease http://www.broadinstitute.org/achilles 216
10496 201 MGI_Mammalian_Phenotype_2013 http://www.informatics.jax.org/ 476
1678 21 BioCarta_2015 https://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 239
756 12 HumanCyc_2015 http://humancyc.org/ 125
3800 48 KEGG_2015 http://www.kegg.jp/kegg/download/ 179
2541 39 NCI-Nature_2015 http://pid.nci.nih.gov/ 209
1918 39 Panther_2015 http://www.pantherdb.org/ 104
5863 51 WikiPathways_2015 http://www.wikipathways.org/index.php/Download_Pathways 404
6768 47 Reactome_2015 http://www.reactome.org/download/index.html 1389
25651 807 ESCAPE http://www.maayanlab.net/ESCAPE/ 315
19129 1594 HomoloGene http://www.ncbi.nlm.nih.gov/homologene 12
23939 293 Disease_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 839
23561 307 Disease_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 839
23877 302 Drug_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 906
15886 9 Genes_Associated_with_NIH_Grants https://grants.nih.gov/grants/oer.htm 32876
24350 299 Drug_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 906
3102 25 KEA_2015 http://amp.pharm.mssm.edu/Enrichr 428
31132 298 Gene_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 2460
30832 302 Gene_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 2460
48230 1429 ChEA_2015 http://amp.pharm.mssm.edu/Enrichr 395
5613 36 dbGaP http://www.ncbi.nlm.nih.gov/gap 345
9559 73 LINCS_L1000_Chem_Pert_up https://clue.io/ 33132
9448 63 LINCS_L1000_Chem_Pert_down https://clue.io/ 33132
16725 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_down http://www.gtexportal.org/ 2918
19249 1443 GTEx_Tissue_Sample_Gene_Expression_Profiles_up http://www.gtexportal.org/ 2918
15090 282 Ligand_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 261
16129 292 Aging_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 286
15309 308 Aging_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 286
15103 318 Ligand_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 261
15022 290 MCF7_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 401
15676 310 MCF7_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 401
15854 279 Microbe_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 312
15015 321 Microbe_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 312
3788 159 LINCS_L1000_Ligand_Perturbations_down https://clue.io/ 96
3357 153 LINCS_L1000_Ligand_Perturbations_up https://clue.io/ 96
12668 300 L1000_Kinase_and_GPCR_Perturbations_down https://clue.io/ 3644
12638 300 L1000_Kinase_and_GPCR_Perturbations_up https://clue.io/ 3644
8973 64 Reactome_2016 http://www.reactome.org/download/index.html 1530
7010 87 KEGG_2016 http://www.kegg.jp/kegg/download/ 293
5966 51 WikiPathways_2016 http://www.wikipathways.org/index.php/Download_Pathways 437
15562 887 ENCODE_and_ChEA_Consensus_TFs_from_ChIP-X 104
17850 300 Kinase_Perturbations_from_GEO_down http://www.ncbi.nlm.nih.gov/geo/ 285
17660 300 Kinase_Perturbations_from_GEO_up http://www.ncbi.nlm.nih.gov/geo/ 285
1348 19 BioCarta_2016 http://cgap.nci.nih.gov/Pathways/BioCarta_Pathways 237
934 13 HumanCyc_2016 http://humancyc.org/ 152
2541 39 NCI-Nature_2016 http://pid.nci.nih.gov/ 209
2041 42 Panther_2016 http://www.pantherdb.org/pathway/ 112
5209 300 DrugMatrix https://ntp.niehs.nih.gov/drugmatrix/ 7876
49238 1550 ChEA_2016 http://amp.pharm.mssm.edu/Enrichr 645
2243 19 huMAP http://proteincomplexes.org/ 995
19586 545 Jensen_TISSUES http://tissues.jensenlab.org/ 1842
22440 505 RNA-Seq_Disease_Gene_and_Drug_Signatures_from_GEO http://www.ncbi.nlm.nih.gov/geo/ 1302
8184 24 MGI_Mammalian_Phenotype_2017 http://www.informatics.jax.org/ 5231
18329 161 Jensen_COMPARTMENTS http://compartments.jensenlab.org/ 2283
15755 28 Jensen_DISEASES http://diseases.jensenlab.org/ 1811
10271 22 BioPlex_2017 http://bioplex.hms.harvard.edu/ 3915
10427 38 GO_Cellular_Component_2017 http://www.geneontology.org/ 636
10601 25 GO_Molecular_Function_2017 http://www.geneontology.org/ 972
13822 21 GO_Biological_Process_2017 http://www.geneontology.org/ 3166
8002 143 GO_Cellular_Component_2017b http://www.geneontology.org/ 816
10089 45 GO_Molecular_Function_2017b http://www.geneontology.org/ 3271
13247 49 GO_Biological_Process_2017b http://www.geneontology.org/ 10125
21809 2316 ARCHS4_Tissues http://amp.pharm.mssm.edu/archs4 108
23601 2395 ARCHS4_Cell-lines http://amp.pharm.mssm.edu/archs4 125
20883 299 ARCHS4_IDG_Coexp http://amp.pharm.mssm.edu/archs4 352
19612 299 ARCHS4_Kinases_Coexp http://amp.pharm.mssm.edu/archs4 498
25983 299 ARCHS4_TFs_Coexp http://amp.pharm.mssm.edu/archs4 1724
19500 137 SysMyo_Muscle_Gene_Sets http://sys-myo.rhcloud.com/ 1135
14893 128 miRTarBase_2017 http://mirtarbase.mbc.nctu.edu.tw/ 3240
17598 1208 TargetScan_microRNA_2017 http://www.targetscan.org/ 683
5902 109 Enrichr_Libraries_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 121
12486 299 Enrichr_Submissions_TF-Gene_Coocurrence http://amp.pharm.mssm.edu/Enrichr 1722
1073 100 Data_Acquisition_Method_Most_Popular_Genes http://amp.pharm.mssm.edu/Enrichr 12
19513 117 DSigDB http://tanlab.ucdenver.edu/DSigDB/DSigDBv1.0/ 4026
14433 36 GO_Biological_Process_2018 http://www.geneontology.org/ 5103
8655 61 GO_Cellular_Component_2018 http://www.geneontology.org/ 446
11459 39 GO_Molecular_Function_2018 http://www.geneontology.org/ 1151
19741 270 TF_Perturbations_Followed_by_Expression http://www.ncbi.nlm.nih.gov/geo/ 1958
27360 802 Chromosome_Location_hg19 http://hgdownload.cse.ucsc.edu/downloads.html 36
13072 26 NIH_Funded_PIs_2017_Human_GeneRIF https://www.ncbi.nlm.nih.gov/pubmed/ 5687
13464 45 NIH_Funded_PIs_2017_Human_AutoRIF https://www.ncbi.nlm.nih.gov/pubmed/ 12558
13787 200 Rare_Diseases_AutoRIF_ARCHS4_Predictions https://amp.pharm.mssm.edu/geneshot/ 3725
13929 200 Rare_Diseases_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
16964 200 NIH_Funded_PIs_2017_AutoRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 12558
17258 200 NIH_Funded_PIs_2017_GeneRIF_ARCHS4_Predictions https://www.ncbi.nlm.nih.gov/pubmed/ 5684
10352 58 Rare_Diseases_GeneRIF_Gene_Lists https://www.ncbi.nlm.nih.gov/gene/about-generif 2244
10471 76 Rare_Diseases_AutoRIF_Gene_Lists https://amp.pharm.mssm.edu/geneshot/ 3725
12419 491 SubCell_BarCode http://www.subcellbarcode.org/ 104
19378 37 GWAS_Catalog_2019 https://www.ebi.ac.uk/gwas 1737
6201 45 WikiPathways_2019_Human https://www.wikipathways.org/ 472
4558 54 WikiPathways_2019_Mouse https://www.wikipathways.org/ 176
3264 22 TRRUST_Transcription_Factors_2019 https://www.grnpedia.org/trrust/ 571
7802 92 KEGG_2019_Human https://www.kegg.jp/ 308
8551 98 KEGG_2019_Mouse https://www.kegg.jp/ 303
12444 23 InterPro_Domains_2019 https://www.ebi.ac.uk/interpro/ 1071
9000 20 Pfam_Domains_2019 https://pfam.xfam.org/ 608
7744 363 DepMap_WG_CRISPR_Screens_Broad_CellLines_2019 https://depmap.org/ 558
6204 387 DepMap_WG_CRISPR_Screens_Sanger_CellLines_2019 https://depmap.org/ 325
13420 32 MGI_Mammalian_Phenotype_Level_4_2019 http://www.informatics.jax.org/ 5261
14148 122 UK_Biobank_GWAS_v1 https://www.ukbiobank.ac.uk/tag/gwas/ 857
9813 49 BioPlanet_2019 https://tripod.nih.gov/bioplanet/ 1510
1397 13 ClinVar_2019 https://www.ncbi.nlm.nih.gov/clinvar/ 182
9116 22 PheWeb_2019 http://pheweb.sph.umich.edu/ 1161
17464 63 DisGeNET https://www.disgenet.org 9828
394 73 HMS_LINCS_KinomeScan http://lincs.hms.harvard.edu/kinomescan/ 148
11851 586 CCLE_Proteomics_2020 https://portals.broadinstitute.org/ccle 378
8189 421 ProteomicsDB_2020 https://www.proteomicsdb.org/ 913
18704 100 lncHUB_lncRNA_Co-Expression https://amp.pharm.mssm.edu/lnchub/ 3729
5605 39 Virus-Host_PPI_P-HIPSTer_2020 http://phipster.org/ 6715
5718 31 Elsevier_Pathway_Collection http://www.transgene.ru/disease-pathways/ 1721
14156 40 Table_Mining_of_CRISPR_Studies 802
# DEGs up and down per cluster
cluster_all = sort(unique(sc_markers[,"cluster"]))
genesets_up = lapply(cluster_all, function(x) {
  tmp = sc_markers_filt_up %>% 
    dplyr::filter(cluster==x) %>% 
    dplyr::pull(gene)
  # Pick the first matching Entrez symbol
  tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>% 
    na.exclude() %>% unique()
  return(tmp)
})
genesets_down = lapply(cluster_all, function(x) {
  tmp = sc_markers_filt_down %>% 
    dplyr::filter(cluster==x) %>% 
    dplyr::pull(gene)
  # Pick the first matching Entrez symbol
  tmp = sapply(tmp, function(x) seurat_rowname_to_entrez[[x]][1]) %>% 
    na.exclude() %>% unique()
  return(tmp)
})
names(genesets_up) = paste0("DEG_up_cluster_", cluster_all)
names(genesets_down) = paste0("DEG_down_cluster_", cluster_all)
genesets = c(genesets_up, genesets_down)
  
# Loop through gene lists
enriched = list()
for (i in 1:length(genesets)) {
  if (length(genesets[[i]]) >= 3) {
    message("Geneset ", names(genesets)[i])
    enriched[[i]] = enrichR::enrichr(genesets[[i]], databases=param$enrichr_dbs)
  } else { 
    message("Geneset ", names(genesets)[i], " has less than 3 genes; skip enrichr")
    enriched[[i]] = NA
  }
}
names(enriched) = names(genesets)

# Write enrichment results to file
enriched_top = matrix(NA, nrow=0, ncol=6)
colnames(enriched_top) = c("GeneSet", "Database", "Term", "Overlap", "Adjusted_pval", "Genes")
for (i in 1:length(enriched)) { 
  if (!is.null(enriched[[i]])) { 
    openxlsx::write.xlsx(enriched[[i]], file=paste0(param$path_out, "/Functions_", names(enriched)[i], ".xlsx"))
  }
}

The following table contains the top enriched term per geneset and database.

Top enriched term per geneset
GeneSet Database Term Overlap Adjusted_pval Genes
DEG_up_cluster_1 GO_Molecular_Function_2018 MHC class I protein binding (GO:0042288) 2/14 0.0052306512098949 CD8B;CD8A
DEG_up_cluster_1 GO_Biological_Process_2018 T cell activation (GO:0042110) 2/88 0.968354596870744 CD8B;CD8A
DEG_up_cluster_1 GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) 2/16 0.0026722041846523 CD8B;CD8A
DEG_up_cluster_2 GO_Molecular_Function_2018 intermediate filament binding (GO:0019215) 1/11 1 VIM
DEG_up_cluster_2 GO_Biological_Process_2018 cytokine-mediated signaling pathway (GO:0019221) 6/633 0.0121089276129934 GATA3;LTB;VIM;PLP2;IL7R;JUNB
DEG_up_cluster_2 GO_Cellular_Component_2018 membrane raft (GO:0045121) 2/119 1 LDHB;MAL
DEG_up_cluster_3 GO_Molecular_Function_2018 phospholipase activator activity (GO:0016004) 1/6 1 CCL5
DEG_up_cluster_3 GO_Biological_Process_2018 lymphocyte mediated immunity (GO:0002449) 2/11 0.0923412918635355 CD8A;KLRD1
DEG_up_cluster_3 GO_Cellular_Component_2018 platelet dense granule lumen (GO:0031089) 1/14 1 CTSW
DEG_up_cluster_4 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 6/16 5.05975921666157e-08 CD74;HLA-DMA;HLA-DMB;HLA-DRA;MS4A1;HLA-DRB1
DEG_up_cluster_4 GO_Biological_Process_2018 antigen receptor-mediated signaling pathway (GO:0050851) 17/257 4.72487752063177e-12 BLK;IGHM;MEF2C;CD79B;CD79A;IGHG1;IGKC;CD19;IGLC3;IGHD;HLA-DPB1;HLA-DRA;IGLC2;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1
DEG_up_cluster_4 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 9/14 3.19242087086651e-16 CD74;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1
DEG_up_cluster_5 GO_Molecular_Function_2018 protein homodimerization activity (GO:0042803) 31/664 0.00203330978499852 CEBPA;CEBPB;BCL2A1;HEXB;LRRK2;PLEK;PYGL;TYMP;PYCARD;GLIPR2;PSAP;HMOX1;LRRFIP1;SRGAP2;S100A11;S100A10;MCL1;BNIP3L;FCER1G;WARS;GCA;STAT1;SLC11A1;ADA2;IRAK3;LILRB2;CCDC88A;CD4;JAML;S100A6;PECAM1
DEG_up_cluster_5 GO_Biological_Process_2018 neutrophil activation involved in immune response (GO:0002283) 91/483 8.57600779873021e-63 FCN1;SERPINA1;ITGAM;HEXB;RAB3D;CTSZ;SLC2A3;PYGL;TCIRG1;CTSS;SIRPB1;PYCARD;LGALS3;GLIPR1;FTH1;ANPEP;LAMP2;TIMP2;COTL1;ITGAX;CTSH;SIRPA;QSOX1;CD36;RAC1;CTSD;CD33;CTSB;ACTR2;SERPINB1;FCER1G;SYK;ANXA2;CD93;SLC11A1;PLAUR;CYBB;RNASE2;TNFRSF1B;RHOA;CKAP4;OSCAR;FGR;RAP2B;TYROBP;RAB31;DOK3;NPC2;PECAM1;ALDOA;S100A9;S100A8;TLR2;FTL;CFD;GRN;ASAH1;CLEC12A;BRI3;GSTP1;STXBP2;C5AR1;FGL2;FPR1;MGST1;IQGAP1;CFP;GNS;CST3;SDCBP;CREG1;PSAP;S100A12;NCKAP1L;CD14;LTA4H;S100A11;GCA;FUCA1;CMTM6;ADA2;LILRB2;LYZ;LILRB3;CPPED1;BST1;FCGR2A;SELL;MNDA;PTPN6;CD68
DEG_up_cluster_5 GO_Cellular_Component_2018 tertiary granule (GO:0070820) 35/164 8.50038554983416e-25 ASAH1;ITGAM;CLEC12A;STXBP2;FPR1;SLC2A3;TCIRG1;CFP;CTSS;CST3;LGALS3;FTH1;LAMP2;TIMP2;ITGAX;SIRPA;CTSH;NCKAP1L;QSOX1;LTA4H;RAC1;CTSD;CD33;FCER1G;CD93;SLC11A1;CYBB;LILRB2;LYZ;RHOA;OSCAR;DOK3;RAP2B;PTPN6;ALDOA
DEG_up_cluster_6 GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) 3/19 0.0190179456163385 KLRC2;KLRD1;KLRC1
DEG_up_cluster_6 GO_Biological_Process_2018 regulation of immune response (GO:0050776) 12/251 1.01266899694941e-08 FCGR3A;NCR3;TYROBP;KLRB1;ITGB2;KLRF1;KLRD1;KLRC1;KIR3DL2;CD247;HCST;KIR2DL3
DEG_up_cluster_6 GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) 2/16 0.360090854988048 ZAP70;CD247
DEG_up_cluster_7 GO_Molecular_Function_2018 endopeptidase activity (GO:0004175) 5/354 0.0626278450321713 GZMM;GZMA;GZMB;CTSW;GZMH
DEG_up_cluster_7 GO_Biological_Process_2018 regulation of immune response (GO:0050776) 8/251 1.73652368233746e-06 CD8B;CD8A;TRAC;TRBC1;CD3G;KLRD1;CD3D;CD99
DEG_up_cluster_7 GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) 4/16 1.28157914616563e-06 CD8B;CD8A;CD3G;CD3D
DEG_up_cluster_8 GO_Molecular_Function_2018 RNA binding (GO:0003723) 12/1387 0.00177473587265024 RPL30;NPM1;RPL32;RPS8;RPS5;RPL22;RCAN3;RPL13;NOSIP;RPS3A;EIF3E;RPS13
DEG_up_cluster_8 GO_Biological_Process_2018 nuclear-transcribed mRNA catabolic process, nonsense-mediated decay (GO:0000184) 9/112 9.29283022220641e-10 RPL30;RPL32;RPS8;RPS5;RPL22;RPL13;RPS3A;EIF3E;RPS13
DEG_up_cluster_8 GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) 8/124 1.22884997243549e-08 RPL30;RPL32;RPS8;RPS5;RPL22;RPL13;RPS3A;RPS13
DEG_up_cluster_9 GO_Molecular_Function_2018 C-C chemokine binding (GO:0019957) 2/8 0.0898232418047339 ZFP36;CXCR4
DEG_up_cluster_9 GO_Biological_Process_2018 negative regulation of I-kappaB kinase/NF-kappaB signaling (GO:0043124) 4/47 0.00601433425946692 NFKBIA;PER1;TNFAIP3;RORA
DEG_up_cluster_9 GO_Cellular_Component_2018 transcription factor TFIIIC complex (GO:0000127) 1/6 1 GTF3C1
DEG_up_cluster_10 GO_Molecular_Function_2018 protein kinase binding (GO:0019901) 22/495 0.00106947791409088 CSF1R;TCF7L2;WARS;MAP3K1;CTBP2;H2AFY;GSTP1;PLEK;RHOG;MSN;AP2A1;RHOC;ACTB;RHOA;RHOB;FGR;MAPKAPK3;CSK;PTPN6;PKN1;CALM2;CACUL1
DEG_up_cluster_10 GO_Biological_Process_2018 neutrophil degranulation (GO:0043312) 56/479 8.36993207998367e-33 FCN1;SERPINA1;CTSZ;GDI2;TCIRG1;CTSS;PYCARD;HK3;FTH1;TIMP2;COTL1;ITGAX;CTSC;CTSB;CAP1;FCER1G;ANXA2;SLC11A1;RNASET2;RHOG;PLAUR;CYBB;TNFRSF1B;RHOA;FGR;TYROBP;RAB31;NPC2;CAT;PECAM1;ALDOA;FTL;CFD;CSTB;ASAH1;CLEC12A;BRI3;GSTP1;STXBP2;C5AR1;FGL2;CFP;GNS;CST3;PSAP;LTA4H;S100A11;CMTM6;LILRB2;ARPC5;LILRB3;CPPED1;TSPAN14;FCGR2A;PTPN6;CD68
DEG_up_cluster_10 GO_Cellular_Component_2018 ficolin-1-rich granule (GO:0101002) 26/184 1.10335422560136e-16 FCN1;CFD;CSTB;ASAH1;SERPINA1;GSTP1;CTSZ;FGL2;TCIRG1;GNS;CTSS;CST3;HK3;FTH1;COTL1;TIMP2;ITGAX;LTA4H;CTSB;FCER1G;SLC11A1;ARPC5;LILRB2;RHOA;CAT;ALDOA
DEG_up_cluster_11 GO_Molecular_Function_2018 actin binding (GO:0003779) 19/254 3.3062051584876e-05 ITGB1;DBNL;DMTN;GSN;TPM4;WDR1;WIPF1;TPM3;ACTN1;ARPC1B;TPM1;PARVB;ARPC5;TMSB4X;COTL1;FLNA;MYH9;ALDOA;VCL
DEG_up_cluster_11 GO_Biological_Process_2018 platelet degranulation (GO:0002576) 25/124 5.85076042390064e-17 APP;CD63;SPARC;WDR1;ITGB3;ITGA2B;STXBP2;F13A1;CLU;THBS1;TMSB4X;FLNA;TIMP1;SRGN;TGFB1;ACTN1;PPBP;TUBA4A;CD9;TAGLN2;TLN1;ALDOA;VCL;FERMT3;PF4
DEG_up_cluster_11 GO_Cellular_Component_2018 secretory granule lumen (GO:0034774) 31/317 3.11697094129311e-13 APP;SPARC;F13A1;CLU;THBS1;CYB5R3;TMSB4X;COTL1;TIMP1;OSTF1;CTSA;SRGN;SERPINB1;DBNL;GSN;TGFB1;TRAPPC1;ACTN1;PPBP;ARPC5;TUBB4B;PRDX6;ARHGAP45;PKM;BIN2;LCN2;ALDOA;VCL;FERMT3;PF4;FTL
DEG_up_cluster_12 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 6/16 1.98163490767778e-06 CD74;HLA-DMA;HLA-DMB;HLA-DRA;HLA-DOA;HLA-DRB1
DEG_up_cluster_12 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen via MHC class II (GO:0019886) 15/97 7.37013560695357e-12 CD74;HLA-DRB5;FCER1G;HLA-DMA;HLA-DMB;AP1S2;AP2S1;HLA-DPB1;HLA-DRA;HLA-DOA;HLA-DQA1;HLA-DRB1;LGMN;HLA-DPA1;HLA-DQB1
DEG_up_cluster_12 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 11/14 8.96763056988054e-19 CD74;HLA-DRB5;HLA-DMA;HLA-DMB;HLA-DPB1;HLA-DRA;HLA-DOA;HLA-DQA1;HLA-DRB1;HLA-DPA1;HLA-DQB1
DEG_down_cluster_1 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 3/16 0.00193713461963498 CD74;HLA-DRA;HLA-DRB1
DEG_down_cluster_1 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) 7/97 4.80272962533632e-07 CD74;FCER1G;AP1S2;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1
DEG_down_cluster_1 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 4/14 1.81644797070467e-06 CD74;HLA-DRA;HLA-DRB1;HLA-DPA1
DEG_down_cluster_2 GO_Molecular_Function_2018 MHC protein complex binding (GO:0023023) 5/19 1.26946118172402e-06 CD74;HLA-DMA;HLA-DRA;KLRD1;HLA-DRB1
DEG_down_cluster_2 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) 8/97 7.35030942410114e-07 CD74;HLA-DMA;HLA-DPB1;HLA-DRA;CTSD;CTSS;HLA-DRB1;HLA-DPA1
DEG_down_cluster_2 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 6/14 3.02060205795267e-10 CD74;HLA-DMA;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DPA1
DEG_down_cluster_3 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 3/16 0.000737638737958742 CD74;HLA-DRA;HLA-DRB1
DEG_down_cluster_3 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) 6/97 3.97837837930265e-06 CD74;AP1S2;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1
DEG_down_cluster_3 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 4/14 4.86393392185293e-07 CD74;HLA-DRA;HLA-DRB1;HLA-DPA1
DEG_down_cluster_4 GO_Molecular_Function_2018 actin binding (GO:0003779) 13/254 0.000300156121957636 ITGB1;CAP1;ARPC1B;DSTN;IQGAP2;SYNE2;TMSB4X;MYH9;FLNA;LCP1;PFN1;ALDOA;MYO1F
DEG_down_cluster_4 GO_Biological_Process_2018 T cell activation (GO:0042110) 15/88 4.03909158166256e-12 MSN;CD3G;CD3E;SLA2;CD3D;CD2;ZAP70;PTPRC;CD8B;LCK;CD8A;CD7;FYN;LCP1;LAT
DEG_down_cluster_4 GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) 8/16 1.19179622306178e-10 ZAP70;CD6;CD8B;CD8A;CD3G;CD247;CD3E;CD3D
DEG_down_cluster_5 GO_Molecular_Function_2018 RNA binding (GO:0003723) 57/1387 7.59568149955502e-23 RPL5;RPL30;RPL3;RPL32;RPL31;RPLP0;RPL10A;SYNE1;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPS18;RPL36;RPL38;RPL37;RPS10;RPS7;RPS8;RPS5;RPS6;CIRBP;RPL13A;RPSA;RPL37A;RPL29;DDX6;RPL10;RPL12;RPL11;RPS4Y1;ZFP36L2;RPS15A;RPS3;RPL14;RPL15;RPS2;RPS27A;RPL18;LYAR;RPL19;RBM38;HSPA8;RPL41;NPM1;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;CALR;FAU;RPS21;NOP53
DEG_down_cluster_5 GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) 48/89 2.88303170655403e-75 RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPL12;RPLP0;RPL11;RPL10A;RPS4Y1;RPS4X;RPS15;RPL7A;RPS14;RPS16;RPS15A;RPS19;RPS18;RPL36;RPL14;RPS3;RPLP2;RPL38;RPL37;RPL15;RPS2;RPL18;RPS27A;RPS10;RPL19;RPL41;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPL29;RPS21
DEG_down_cluster_5 GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) 49/124 3.00367186609762e-69 RPL5;RPL30;RPL3;RPL32;RPL31;RPLP0;RPL10A;RPS15;RPS4X;RPS14;RPL7A;RPS16;RPS19;RPL36AL;RPS18;RPL36;RPLP2;RPL38;RPL37;RPS10;RPS7;RPS8;RPS5;RPS6;RPL13A;RPSA;RPL37A;RPL29;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPS21
DEG_down_cluster_6 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 3/16 0.00193713461963498 CD74;HLA-DRA;HLA-DRB1
DEG_down_cluster_6 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) 5/97 0.00159727585213224 CD74;HLA-DRA;CTSS;HLA-DRB1;HLA-DPA1
DEG_down_cluster_6 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 4/14 1.81644797070467e-06 CD74;HLA-DRA;HLA-DRB1;HLA-DPA1
DEG_down_cluster_7 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 3/16 0.00236424694205082 CD74;HLA-DRA;HLA-DRB1
DEG_down_cluster_7 GO_Biological_Process_2018 cytokine-mediated signaling pathway (GO:0019221) 9/633 0.00224498626907092 IFITM3;NFKBIA;HLA-DRA;FOS;LTB;VIM;IL7R;JUNB;HLA-DRB1
DEG_down_cluster_7 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 3/14 0.000596771775624343 CD74;HLA-DRA;HLA-DRB1
DEG_down_cluster_8 GO_Molecular_Function_2018 cadherin binding (GO:0045296) 13/313 4.36984821605319e-05 ITGB1;ANXA1;HSPA5;ANXA2;AHNAK;IQGAP1;EFHD2;FLNA;TAGLN2;PFN1;EZR;CLIC1;S100A11
DEG_down_cluster_8 GO_Biological_Process_2018 platelet degranulation (GO:0002576) 12/124 4.2361841852672e-08 LYN;SRGN;CD63;TGFB1;APLP2;PSAP;PLEK;ANXA5;FLNA;TAGLN2;CTSW;TUBA4A
DEG_down_cluster_8 GO_Cellular_Component_2018 focal adhesion (GO:0005925) 18/356 1.25359110771497e-09 ITGB1;ANXA1;TPM4;HSPA5;AHNAK;CD81;ARPC1B;ANXA5;IQGAP1;ACTB;ARPC2;CAPN2;FLNA;CALR;LCP1;PFN1;EZR;CD99
DEG_down_cluster_9 GO_Molecular_Function_2018 MHC class II protein complex binding (GO:0023026) 4/16 2.28220256145266e-05 CD74;HLA-DMA;HLA-DRA;HLA-DRB1
DEG_down_cluster_9 GO_Biological_Process_2018 antigen processing and presentation of exogenous peptide antigen (GO:0002478) 8/97 5.04886106824468e-08 CD74;HLA-DMA;HLA-DPB1;HLA-DRA;CTSD;CTSS;HLA-DRB1;HLA-DPA1
DEG_down_cluster_9 GO_Cellular_Component_2018 MHC class II protein complex (GO:0042613) 6/14 4.11555274787584e-11 CD74;HLA-DMA;HLA-DPB1;HLA-DRA;HLA-DRB1;HLA-DPA1
DEG_down_cluster_10 GO_Molecular_Function_2018 RNA binding (GO:0003723) 42/1387 1.73489121755495e-13 RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP0;RPL10A;RPL9;ZFP36L2;SYNE1;RPS4X;RPS14;RPL7A;RPS15A;RPS18;RPS3;RPL14;RPL13;RPL37;RPS27A;LYAR;RPL19;RBM38;JUN;NPM1;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPS3A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPL37A;RPS20;RPS21
DEG_down_cluster_10 GO_Biological_Process_2018 SRP-dependent cotranslational protein targeting to membrane (GO:0006614) 39/89 2.83100252338431e-58 RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP1;RPLP0;RPL10A;RPL9;RPS4X;RPL7A;RPS14;RPS15A;RPS18;RPL14;RPS3;RPLP2;RPL13;RPL37;RPS27A;RPL19;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPS3A;RPSA;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPS20;RPS21
DEG_down_cluster_10 GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) 39/124 2.05288819641447e-52 RPL5;RPL30;RPL3;RPL32;RPL10;RPL31;RPLP1;RPLP0;RPL10A;RPL9;RPS4X;RPS14;RPL7A;RPS15A;RPS18;RPL14;RPS3;RPLP2;RPL13;RPL37;RPS27A;RPL19;RPS7;RPS8;RPS5;RPS6;RPL13A;RPL35A;RPSA;RPS3A;RPL23A;RPS26;RPS25;RPS28;RPS27;RPS29;RPL37A;RPS20;RPS21
DEG_down_cluster_11 GO_Molecular_Function_2018 RNA binding (GO:0003723) 242/1387 3.33622644554002e-104 RPL4;EIF4A2;RPL5;EIF4A1;RPL30;RPL3;RPL32;RPL31;RPL34;HNRNPU;HNRNPR;RPL8;RPL9;RPL6;RPL7;RPS15;RPS14;PNN;RPS17;RPS16;LGALS1;RPL18A;RPS19;SNRPD2;RPS18;SNRPD1;RPL36;RPL35;ARL6IP4;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;DDX17;SRRM2;RPL21;SCAF11;RPL23;RPL22;CIRBP;SRRM1;FAM133B;DDX39A;SUB1;RPL24;RPL27;RPL26;RPL29;EZR;RPL28;SF1;BAZ2A;ZC3HAV1;RPS4Y1;SAMSN1;MRPL10;MTDH;DHX36;LYAR;BTF3;RPL41;JUN;PRPF38B;XRCC6;PRRC2C;PA2G4;HNRNPL;RPS26;BST2;HNRNPM;RPS25;RPS28;RPS27;RPL27A;HNRNPD;RPS20;HNRNPC;CALR;RPS21;RPS24;RPS23;SMG1;RPLP0;C1QBP;RACK1;SEC61B;ANXA2;PRPF40A;PPHLN1;NSA2;HNRNPH1;CSDE1;CANX;ARHGEF1;PABPC1;SF3B2;RPL10;AHNAK;RNMT;RPL12;RPL11;EXOSC6;BCLAF1;RPS15A;SRP72;TCOF1;RPL14;RPS3;RPL13;RPS2;RPL15;RPL18;RPS27A;SF3B1;SRSF10;SRSF11;RPL19;NPM1;RSRC2;KTN1;RSL1D1;RPL22L1;FAU;PEBP1;NUCKS1;RPL10A;BZW1;FBL;ZFP36;ZNF207;HMGN2;CAST;PNISR;RPS9;RPS7;RPS8;RPS5;RPS6;NOSIP;RPSA;PRPF4B;ILF2;TUFM;EEF1A1;ILF3;HNRNPUL1;NCL;SRSF2;SNRPG;SBDS;S100A4;ERH;SRSF5;SNRPF;PPIG;SNRNP200;PPIB;SRSF7;PPIA;KPNB1;KHDRBS1;DDX6;DDX5;CORO1A;ZFP36L2;HSP90B1;HNRNPDL;TPR;UBC;H1FX;HSPA9;HSPA8;FUS;NONO;TBCA;EEF2;LSM3;LSM8;SON;EIF2S3;CNOT1;UBE2N;RSL24D1;RBM25;HSP90AB1;DDX3X;CELF1;CELF2;ADAR;CHD2;SYNE1;RPS4X;RBM3;RPL7A;SUMO1;RBM5;HSP90AA1;RPL13A;RPS3A;SFPQ;NEMF;SYF2;EEF1D;THRAP3;RPL37A;LUC7L3;SNRPA1;DDX24;DDX21;U2AF1;SAMHD1;IFI16;PABPN1;TRA2A;EIF4H;SAP18;HNRNPA1;EIF4B;RBM39;HNRNPA3;NOP58;CNBP;CCDC59;RPL35A;RPL23A;HSPE1;EIF3L;EIF3G;EIF3H;EIF3E;VIM;SSBP1;NOP53;EIF3D;RBMX;TPT1;EIF3A;TNRC6A;RAN;TNRC6B
DEG_down_cluster_11 GO_Biological_Process_2018 protein targeting to ER (GO:0045047) 81/97 3.85134564072111e-99 RPL4;RPL5;RPL30;RPL3;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPS18;SEC61G;RPL36;RPL35;RPLP2;RPL38;RPL37;SEC62;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;CHMP4A;RPL26;RPL29;RPL28;UBA52;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;SRP72;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RPS28;SPCS2;RPS27;RPS29;RPL27A;RPS20;RPS21;RPS24;RPS23
DEG_down_cluster_11 GO_Cellular_Component_2018 cytosolic ribosome (GO:0022626) 81/124 7.88180682609739e-85 RPL4;RPL5;RPL30;RPL3;DDX3X;RPL32;RPL31;RPL34;RPLP1;RPLP0;RPL8;RPL10A;RPL9;RPL6;RPL7;RPS15;RPS4X;RPS14;RPL7A;RPS17;RPS16;RPS19;RPL18A;RPL36AL;RPS18;RPL36;RACK1;RPL35;RPLP2;RPL38;RPL37;RPS11;RPL39;RPS10;RPS13;RPS9;RPL21;RPS7;RPS8;RPL23;RPS5;RPL22;RPS6;RPL13A;RPSA;RPS3A;RPL37A;RPL24;RPL27;RPL26;RPL29;RPL28;RPL10;RPL12;RPL11;RPS4Y1;RPS15A;RPL14;RPS3;RPL13;RPL15;RPS2;RPL18;RPS27A;RPL19;RPL41;RPL35A;RPL23A;RPS26;RPS25;RSL1D1;RPS28;RPS27;RPS29;RPL27A;RPS20;RPL22L1;RPS21;RSL24D1;RPS24;RPS23
DEG_down_cluster_12 GO_Molecular_Function_2018 T cell receptor binding (GO:0042608) 4/6 5.4641345261933e-06 LCK;CD3G;CD3E;HLA-E
DEG_down_cluster_12 GO_Biological_Process_2018 T cell activation (GO:0042110) 11/88 5.98573039475656e-10 CD2;ZAP70;CD8B;LCK;CD8A;CD7;RHOH;CD3G;FYN;CD3E;CD3D
DEG_down_cluster_12 GO_Cellular_Component_2018 T cell receptor complex (GO:0042101) 8/16 4.66933721922445e-13 ZAP70;CD6;CD8B;CD8A;CD3G;CD247;CD3E;CD3D

Loupe Cell Browser integration

We export the UMAP 2D visualisation, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser.

# Export UMAP coordinates
loupe_umap = as.data.frame(sc@reductions$umap@cell.embeddings)
loupe_umap = cbind(Barcode=rownames(loupe_umap), loupe_umap)
colnames(loupe_umap) = c("Barcode", "UMAP-1", "UMAP-2")
write.table(loupe_umap, file=paste0(param$path_out, "/Seurat2Loupe_umap.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

# Export categorical metadata
loupe_meta = as.data.frame(sc@meta.data)
idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
loupe_meta = cbind(Barcode=rownames(loupe_meta), loupe_meta[, idx_keep])
write.table(x=loupe_meta, file=paste0(param$path_out, "/Seurat2Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

# Export gene sets
loupe_genesets = data.frame(List=paste0("DEG_up_cluster_", sc_markers_filt_up[,"cluster"]), 
                            Name=sc_markers_filt_up[,"gene"], 
                            Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_up[,"gene"]])
loupe_genesets = rbind(loupe_genesets, 
                       data.frame(List=paste0("DEG_down_cluster_", sc_markers_filt_down[,"cluster"]), 
                                  Name=sc_markers_filt_down[,"gene"], 
                                  Ensembl=seurat_rowname_to_ensembl[sc_markers_filt_down[,"gene"]]))

genesets_to_export = list(genes_cc_s_phase=genes_s[,2], genes_cc_g2m_phase=genes_g2m[,2])
for (i in names(genesets_to_export)) {
  tmp_genes = genesets_to_export[[i]]
  tmp_genes = tmp_genes[tmp_genes %in% names(symbol_to_ensembl)]
  loupe_genesets = rbind(loupe_genesets,
                         data.frame(List=i,
                                    Name=tmp_genes,
                                    Ensembl=seurat_rowname_to_ensembl[tmp_genes]))
}

write.table(loupe_genesets, file=paste0(param$path_out, "/Seurat2Loupe_genesets.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")

Output files

All files generated with this report are written into the provided output folder test_datasets/10x_SmartSeq2_pbmc_GSE132044/results/:

Software versions

This report was generated using the scrnaseq GitHub repository. Software versions were collected at run time.

Name Version
ktrns/scrnaseq 2c34e505a1815112ea1570df340f4bb319e8b0e6
R R version 3.6.1 (2019-07-05)
Platform x86_64-apple-darwin15.6.0 (64-bit)
Operating system macOS Mojave 10.14.6
Packages abind1.4-5, AnnotationDbi1.46.1, ape5.3, assertthat0.2.1, backports1.1.5, bibtex0.4.2, Biobase2.44.0, BiocGenerics0.30.0, BiocParallel1.18.1, biomaRt2.40.5, bit1.1-14, bit640.9-7, bitops1.0-6, blme1.0-4, blob1.2.0, boot1.3-23, caTools1.17.1.3, cli2.0.2, cluster2.1.0, codetools0.2-16, colorspace1.4-1, cowplot1.0.0, crayon1.3.4, curl4.3, data.table1.12.6, DBI1.0.0, DelayedArray0.10.0, digest0.6.25, dplyr0.8.3, enrichR2.1, evaluate0.14, fansi0.4.0, farver2.0.1, fitdistrplus1.0-14, future1.15.1, future.apply1.3.0, gdata2.18.0, GenomeInfoDb1.20.0, GenomeInfoDbData1.2.1, GenomicRanges1.36.1, ggplot23.2.1, ggrepel0.8.1, ggridges0.5.1, ggsci2.9, globals0.12.5, glue1.4.1, gplots3.0.1.1, gridExtra2.3, gtable0.3.0, gtools3.8.1, highr0.8, hms0.5.2, htmltools0.4.0, htmlwidgets1.5.1, httr1.4.1, ica1.0-2, igraph1.2.4.2, IRanges2.18.3, irlba2.3.3, jsonlite1.6.1, kableExtra1.1.0, KernSmooth2.23-16, knitcitations1.0.10, knitr1.26, labeling0.3, lattice0.20-38, lazyeval0.2.2, leiden0.3.1, lifecycle0.1.0, listenv0.8.0, lme41.1-21, lmtest0.9-37, lsei1.2-0, lubridate1.7.4, magrittr1.5, MASS7.3-51.4, Matrix1.2-18, matrixStats0.55.0, memoise1.1.0, minqa1.2.4, munsell0.5.0, nlme3.1-142, nloptr1.2.1, npsurv0.4-0, openxlsx4.1.4, patchwork1.0.0, pbapply1.4-2, pillar1.4.2, pkgconfig2.0.3, plotly4.9.1, plyr1.8.4, png0.1-7, prettyunits1.0.2, progress1.2.2, purrr0.3.3, R62.4.1, RANN2.6.1, RColorBrewer1.1-2, Rcpp1.0.3, RcppAnnoy0.0.14, RcppParallel4.4.4, RCurl1.95-4.12, readr1.3.1, RefManageR1.2.12, reshape21.4.3, reticulate1.13, rjson0.2.20, rlang0.4.6, rmarkdown1.18, ROCR1.0-7, RSQLite2.1.4, rstudioapi0.11, rsvd1.0.2, Rtsne0.15, rvest0.3.5, S4Vectors0.22.1, scales1.1.0, sctransform0.2.0, sessioninfo1.1.1, Seurat3.1.5, SingleCellExperiment1.6.0, stringi1.4.3, stringr1.4.0, SummarizedExperiment1.14.1, survival3.1-8, tibble2.1.3, tidyr1.0.0, tidyselect0.2.5, tsne0.1-3, uwot0.1.5, vctrs0.2.0, viridisLite0.3.0, webshot0.5.2, withr2.1.2, xfun0.11, XML3.98-1.20, xml21.2.2, XVector0.24.0, yaml2.2.0, zeallot0.1.0, zip2.0.4, zlibbioc1.30.0, zoo1.8-6

References

Chen, Edward Y. 2020. “Enrichr.” . https://amp.pharm.mssm.edu/Enrichr/.

Gandolfo, Luke C., and Terence P. Speed. 2018. “RLE Plots: Visualizing Unwanted Variation in High Dimensional Data.” Edited by Enrique Hernandez-Lemus. PLOS ONE 13 (2). Public Library of Science (PLoS): e0191629. https://doi.org/10.1371/journal.pone.0191629.

Hafemeister, Christoph, and Rahul Satija. 2019. “Normalization and Variance Stabilization of Single-Cell RNA-Seq Data Using Regularized Negative Binomial Regression.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1874-1.

Liu, Fenglin, Yuanyuan Zhang, Lei Zhang, Ziyi Li, Qiao Fang, Ranran Gao, and Zemin Zhang. 2019. “Systematic Comparative Analysis of Single-Nucleotide Variant Detection Methods from Single-Cell RNA Sequencing Data.” Genome Biology 20 (1). Springer Science; Business Media LLC. https://doi.org/10.1186/s13059-019-1863-4.

Tirosh, I., B. Izar, S. M. Prakadan, M. H. Wadsworth, D. Treacy, J. J. Trombetta, A. Rotem, et al. 2016. “Dissecting the Multicellular Ecosystem of Metastatic Melanoma by Single-Cell RNA-Seq.” Science 352 (6282). American Association for the Advancement of Science (AAAS): 189–96. https://doi.org/10.1126/science.aad0501.